library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /Users/hlynur/wd/ml_case_studies
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
sisters <- read_csv(file = here("data", "sisters67.csv"))
## Parsed with column specification:
## cols(
##   .default = col_integer()
## )
## See spec(...) for full column specifications.
sisters <- sisters %>% 
  mutate(age = (v181 + 1) * 10) %>% 
  select(sister, age, glue("v{116:180}"))

sisters
sisters %>% 
  ggplot(mapping = aes(x = age)) +
  geom_histogram(binwidth = 10)
## Warning: Removed 5030 rows containing non-finite values (stat_bin).

tidy_sisters <- sisters %>% 
  select(-sister) %>% 
  gather(key, value, -age)

tidy_sisters %>% 
  group_by(age) %>% 
  summarize(medaltal = mean(value, na.rm = TRUE))
tidy_sisters %>% 
  count(value)
tidy_sisters %>%
    filter(complete.cases(.),
           key %in% glue("v{153:170}")) %>%
    group_by(key, value) %>%
    summarise(age = mean(age)) %>%
    ggplot(aes(value, age, color = key)) +
    geom_path(alpha = 0.5, size = 1.5) +
    geom_point(size = 2) +
    facet_wrap(~key) +
    guides(color = FALSE)

sisters_select <- sisters %>% 
  select(-sister) %>% 
  filter(complete.cases(.))

simple_lm <- lm(age ~.,
                data = sisters_select)

summary(simple_lm)
## 
## Call:
## lm(formula = age ~ ., data = sisters_select)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.040  -9.572  -1.120   8.946  63.563 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.7590627  0.5235690  54.929  < 2e-16 ***
## v116        -0.5987470  0.0386810 -15.479  < 2e-16 ***
## v117        -0.0565771  0.0440215  -1.285 0.198721    
## v118        -0.8286660  0.0420266 -19.718  < 2e-16 ***
## v119        -0.2928765  0.0412276  -7.104 1.22e-12 ***
## v120        -0.0414388  0.0373361  -1.110 0.267053    
## v121        -0.1015904  0.0387306  -2.623 0.008718 ** 
## v122         0.0366553  0.0403965   0.907 0.364205    
## v123        -1.0233578  0.0448046 -22.840  < 2e-16 ***
## v124         0.5335434  0.0430399  12.396  < 2e-16 ***
## v125         0.4809905  0.0368283  13.060  < 2e-16 ***
## v126         0.3458564  0.0422866   8.179 2.91e-16 ***
## v127        -0.0192932  0.0392643  -0.491 0.623168    
## v128         0.0009816  0.0377115   0.026 0.979234    
## v129         0.0803219  0.0386387   2.079 0.037639 *  
## v130         0.5897471  0.0398385  14.803  < 2e-16 ***
## v131        -0.5465149  0.0420680 -12.991  < 2e-16 ***
## v132         0.0988385  0.0459866   2.149 0.031614 *  
## v133        -0.5773012  0.0374372 -15.421  < 2e-16 ***
## v134        -0.1041877  0.0453769  -2.296 0.021676 *  
## v135         0.3732995  0.0542140   6.886 5.80e-12 ***
## v136        -0.1364003  0.0368229  -3.704 0.000212 ***
## v137         0.6211468  0.0505179  12.296  < 2e-16 ***
## v138         0.2985449  0.0523696   5.701 1.20e-08 ***
## v139         1.4131884  0.0520453  27.153  < 2e-16 ***
## v140        -0.7349810  0.0366384 -20.060  < 2e-16 ***
## v141         0.3995640  0.0462512   8.639  < 2e-16 ***
## v142        -0.2730768  0.0414278  -6.592 4.38e-11 ***
## v143         0.0755909  0.0417660   1.810 0.070320 .  
## v144         1.0629185  0.0536219  19.822  < 2e-16 ***
## v145        -0.6436678  0.0405920 -15.857  < 2e-16 ***
## v146        -0.4368924  0.0402487 -10.855  < 2e-16 ***
## v147        -0.4288801  0.0487510  -8.797  < 2e-16 ***
## v148         1.4320922  0.0452772  31.629  < 2e-16 ***
## v149        -0.2656667  0.0423536  -6.273 3.57e-10 ***
## v150        -0.2451097  0.0418002  -5.864 4.54e-09 ***
## v151         0.7228294  0.0443463  16.300  < 2e-16 ***
## v152         0.0565705  0.0405800   1.394 0.163307    
## v153        -0.6064906  0.0454401 -13.347  < 2e-16 ***
## v154         1.0710276  0.0387038  27.672  < 2e-16 ***
## v155         0.8692618  0.0490792  17.711  < 2e-16 ***
## v156         1.2491486  0.0454012  27.514  < 2e-16 ***
## v157         0.1566712  0.0410294   3.819 0.000134 ***
## v158        -0.0917289  0.0413971  -2.216 0.026706 *  
## v159        -0.2252867  0.0412120  -5.467 4.60e-08 ***
## v160         0.2028421  0.0425102   4.772 1.83e-06 ***
## v161         0.7971671  0.0433414  18.393  < 2e-16 ***
## v162        -0.2863248  0.0496361  -5.768 8.03e-09 ***
## v163        -1.3637209  0.0459544 -29.676  < 2e-16 ***
## v164        -0.3758315  0.0346484 -10.847  < 2e-16 ***
## v165        -0.4915162  0.0450373 -10.914  < 2e-16 ***
## v166         0.1195126  0.0417274   2.864 0.004183 ** 
## v167        -0.1726421  0.0406119  -4.251 2.13e-05 ***
## v168         0.3797206  0.0466675   8.137 4.12e-16 ***
## v169         1.4159979  0.0428306  33.060  < 2e-16 ***
## v170         1.2342274  0.0450740  27.382  < 2e-16 ***
## v171         0.8843819  0.0415874  21.266  < 2e-16 ***
## v172        -0.0140677  0.0356989  -0.394 0.693535    
## v173         0.7792050  0.0344868  22.594  < 2e-16 ***
## v174         0.2376283  0.0390072   6.092 1.12e-09 ***
## v175         0.1756163  0.0392362   4.476 7.62e-06 ***
## v176         0.8291804  0.0445449  18.614  < 2e-16 ***
## v177        -0.0007449  0.0476089  -0.016 0.987517    
## v178        -0.0239207  0.0411134  -0.582 0.560688    
## v179         0.2843894  0.0459248   6.193 5.95e-10 ***
## v180         0.0481848  0.0422827   1.140 0.254462    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 77046 degrees of freedom
## Multiple R-squared:  0.3386, Adjusted R-squared:  0.338 
## F-statistic: 606.8 on 65 and 77046 DF,  p-value: < 2.2e-16
set.seed(1234)
in_train <- createDataPartition(sisters_select$age, 
                                p = 0.6, 
                                list = FALSE)

training <- sisters_select[in_train, ]
validation_test <- sisters_select[-in_train, ]

in_test <- createDataPartition(validation_test$age,
                               p = 0.5,
                               list = FALSE)

testing <- validation_test[in_test, ]
validation <- validation_test[-in_test, ]
#CART
sisters_cart <- train(x = select(training, -age),
                      y = training$age,
                      method = "rpart")
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: Setting row names on a tibble is deprecated.
# ## xgboost
# sisters_rf <- train(age ~ ., method = "xgbLinear", data = training)
# 
# ## gbm
# sisters_gbm <- train(age ~ ., method = "gbm", data = training)
library(yardstick)
## Loading required package: broom
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     mnLogLoss, precision, recall
## The following object is masked from 'package:readr':
## 
##     spec
model_results <- validation %>% 
  mutate(CART = predict(sisters_cart, validation)) 

model_results %>% 
  metrics(truth = age, estimate = CART)